# download_tesol.py
# TESOL (International Journal of TESOL Studies) Union Journal Downloader
# Automates downloading PDFs from TESOL Union Journal issues
# - Parses issue index and article pages
# - Extracts direct PDF links from attachments
# - Skips articles whose titles start with "Foreword" or "Special Issue"
# - Creates folders named TESOL_<issueID> dynamically

"""
download_tesol_issue.py

TESOL Union Journal Downloader

Features:
- Downloads all PDFs from a TESOL issue.
- Skips articles with titles starting with "Foreword" or "Special Issue".
- Creates folder named TESOL_<issueID> (e.g., TESOL_2023-3).
"""

import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# HTTP headers passed to every requests.get() call in this script (issue page,
# article pages and PDF downloads).
# NOTE(review): presumably the browser-like User-Agent is there so the site does
# not reject the default python-requests agent — confirm.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def sanitize(text: str) -> str:
    """Return *text* made safe for use as a file or folder name.

    Runs of ASCII control characters (newlines, tabs, NUL, DEL, ...) are
    replaced by a single space, the characters ``\\ / * ? : " < > |`` are
    removed, and leading/trailing whitespace is stripped.

    The result may be an empty string if nothing usable remains.
    """
    # Text scraped from HTML can contain embedded newlines/tabs; they are not
    # valid in Windows filenames, so turn each run into one space rather than
    # gluing the neighbouring words together.
    text = re.sub(r"[\x00-\x1f\x7f]+", " ", text)
    return re.sub(r'[\\/*?:"<>|]', "", text).strip()

# Input URL for TESOL issue (e.g., https://www.tesolunion.org/journal/lists/folder/eMTMu0MDQx/)
issue_url = input("Enter TESOL issue URL: ").strip()

# Seconds to wait for the server to connect / send data. Without a timeout,
# requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Title prefixes (lower-case) of entries that are not downloaded.
SKIP_PREFIXES = ("foreword", "special issue")

# Matches the href of a direct PDF attachment on an article page.
PDF_HREF_RE = re.compile(r"/attachments/files/.*\.pdf")

# Fetch the issue index. raise_for_status() turns a 4xx/5xx reply into an error
# instead of letting an error page be parsed as if it were the issue listing.
try:
    resp = requests.get(issue_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
except requests.RequestException as e:
    raise SystemExit(f"[ERROR] Could not load issue page: {e}")
soup = BeautifulSoup(resp.text, "html.parser")

parsed = urlparse(issue_url)
base_url = f"{parsed.scheme}://{parsed.netloc}"

# Extract issue ID (e.g., 2023-3) from <h5>, or fall back to the last URL part.
# Trailing slashes are stripped first: basename() of ".../eMTMu0MDQx/" is "".
title_tag = soup.find("h5")
issue_id = sanitize(title_tag.get_text(strip=True)) if title_tag else ""
if not issue_id:
    issue_id = sanitize(os.path.basename(parsed.path.rstrip("/")))

folder_name = f"TESOL_{issue_id}"
os.makedirs(folder_name, exist_ok=True)

# Collect (title, article page URL) pairs from the issue index.
article_links = []
for li in soup.find_all("li", class_="title"):
    a = li.find("a", href=True)
    if not a:
        continue
    title = sanitize(a.get_text(strip=True))
    # Skip rules
    if title.lower().startswith(SKIP_PREFIXES):
        print(f"[SKIP] {title}")
        continue
    article_url = urljoin(base_url, a["href"])
    article_links.append((title, article_url))

print(f"Found {len(article_links)} articles to download")

# Number of PDFs actually saved during this run.
count = 0

for title, article_url in article_links:
    filename = os.path.join(folder_name, f"{title}.pdf")

    # Checked before any network access so re-runs do not re-fetch article
    # pages for files that are already present.
    if os.path.exists(filename):
        print(f"[SKIP] Already downloaded: {title}")
        continue

    # A failure on one article page must not abort the remaining downloads.
    try:
        r = requests.get(article_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] {title}: {e}")
        continue
    art_soup = BeautifulSoup(r.text, "html.parser")

    pdf_link = art_soup.find("a", href=PDF_HREF_RE)
    if not pdf_link:
        print(f"[SKIP] No PDF link found for: {title}")
        continue

    pdf_url = urljoin(base_url, pdf_link["href"])

    print(f"[{count+1}] Downloading: {title}")
    # Write to a temporary name and rename on success, so an interrupted write
    # never leaves a truncated "<title>.pdf" that later runs would skip.
    part_name = filename + ".part"
    try:
        pdf = requests.get(pdf_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        # Without this check an HTTP error page would be saved as the PDF.
        pdf.raise_for_status()
        with open(part_name, "wb") as f:
            f.write(pdf.content)
        os.replace(part_name, filename)
    except (requests.RequestException, OSError) as e:
        print(f"[ERROR] {title}: {e}")
        # Best-effort cleanup of the partial file; a leftover ".part" is
        # harmless because the next attempt overwrites it.
        try:
            if os.path.exists(part_name):
                os.remove(part_name)
        except OSError:
            pass
        continue

    count += 1
    print(f"[OK] Saved: {title}")

print(f"\nDone! {count} PDFs saved in {folder_name}")
